#################################################################################
# Filename: 1_round2_loaddata.R
# Description: Loads raw survey data from the Round 2 study and performs cleaning
#################################################################################

### Load library dependencies 
library(tidyverse)
library(broom)
library(DeclareDesign)
library(estimatr)
library(rio)
library(ggplot2)
library(gridExtra)

### Set directory
# The path below is a placeholder and must be replaced with the folder that
# contains "Round_2_raw.Rdata"; the relative file names used for import and
# save in this script are resolved against this working directory.
setwd(" < YOUR DIRECTORY HERE > ") 


### Import raw data
# Remove every object from the current environment before loading. Attached
# packages are not affected by rm().
rm(list=ls())
# import() is presumably rio::import (rio is attached above) -- confirm that
# no later-attached package masks it. The result is stored as `dat`, the
# object every later step modifies.
dat <- import("Round_2_raw.Rdata")


###
###Variable creation
###


# Recode answer labels to numeric codes everywhere in the data frame.
#
# Each label is matched exactly (==) against every cell of `dat`, so the order
# of the entries below does not matter. Replaced cells in character columns
# end up holding the code as text (e.g. "5"); the columns are converted to
# numeric further down in the script.
likert_codes <- c(
  # Agreement scale (赞同 wording)
  "强烈赞同" = 5,
  "赞同" = 4,
  "既不赞同也不反对" = 3,
  "反对" = 2,
  "强烈反对" = 1,
  # Intensity scale
  "极其强烈" = 5,
  "较为强烈" = 4,
  "有一些" = 3,
  "仅有一点点" = 2,
  "完全没有或几乎没有" = 1,
  # Magnitude scale
  "非常大" = 5,
  "比较大" = 4,
  "适中" = 3,
  "比较小" = 2,
  "非常小" = 1,
  # Agreement scale (同意 wording). Its "反对" / "强烈反对" labels are the
  # same strings as in the first scale, so they are not repeated here.
  "强烈同意" = 5,
  "同意" = 4,
  # NOTE(review): the literal on the next line is kept exactly as it was; its
  # second character appears to be the Kangxi radical U+2F74, which renders
  # like the ordinary character U+7ACB but does not compare equal to it. The
  # entry after it matches the ordinary spelling so that neutral answers are
  # recoded whichever form the raw data uses -- confirm against the raw file.
  "中⽴" = 3,
  "中\u7acb" = 3,
  # Change scale
  "增加" = 3,
  "保持不变" = 2,
  "减少" = 1
)

for (answer_label in names(likert_codes)) {
  dat[dat == answer_label] <- likert_codes[[answer_label]]
}

# "Don't know / refuse to answer" is treated as missing.
dat[dat == "不知道/拒绝回答"] <- NA

# Age brackets in ascending order; a bracket's position in this vector is the
# code that replaces its label in Q3 (1 = "18-24", ..., 10 = "65岁以上").
age_brackets <- c(
  "18-24", "25-29", "30-34", "35-39", "40-44",
  "45-49", "50-54", "55-59", "60-64", "65岁以上"
)

bracket_code <- match(dat$Q3, age_brackets)
has_bracket <- !is.na(bracket_code)

# Only cells holding a known bracket label are overwritten; anything else in
# Q3 is left as it is.
dat$Q3[has_bracket] <- bracket_code[has_bracket]

# Numeric copy of the recoded bracket.
dat$age <- as.numeric(dat$Q3)


#
#Make variables numeric
#

# Four-part item batteries: Q19_1..Q19_4, Q21_1..Q21_4, ..., Q65_1..Q65_4.
battery_items <- paste0(
  rep(c("Q19", "Q21", "Q54", "Q55", "Q56", "Q64", "Q65"), each = 4),
  "_",
  1:4
)

# Every survey item that has to be numeric, in the order it is converted.
numeric_items <- c(
  "Q28", "Q25", "Q53", "Q26", "Q52",
  battery_items,
  "Q14", "Q69", "Q70", "Q68"
)

# First pass ("") handles the base columns, second pass ("_f") the columns
# that carry the "_f" suffix. After each pass Q68 is copied under the
# descriptive name responsive.gov / responsive.gov_f.
for (suffix in c("", "_f")) {
  for (item in paste0(numeric_items, suffix)) {
    dat[[item]] <- as.numeric(dat[[item]])
  }
  dat[[paste0("responsive.gov", suffix)]] <- dat[[paste0("Q68", suffix)]]
}

# round2 is 1 when Finished_f equals "True" and 0 otherwise (missing included).
dat$round2 <- ifelse(dat$Finished_f %in% "True", 1, 0)

# Treatment arm as a factor with "placebo" as the first (reference) level.
clip_levels <- c("placebo", "Japan", "HK")
dat$clip <- factor(dat$clip, levels = clip_levels)


#
# Create protest variables
#
# For Q32 and Q32_f: 1 if the answer contains "参加游行活动", NA if the answer
# is an empty string, 0 in every other case.
for (suffix in c("", "_f")) {
  answers <- dat[[paste0("Q32", suffix)]]

  took_part <- rep(0, nrow(dat))
  took_part[answers == ""] <- NA
  took_part[grepl("参加游行活动", answers)] <- 1

  dat[[paste0("protest.gov", suffix)]] <- took_part
}


#One protest variable counts anyone who "agrees" regardless of whether they sign

#dat$protest.japan.all <- NA
#dat$protest.japan.all[dat$Q33=="愿意（请仅填写姓氏）"] <- 1
#dat$protest.japan.all[dat$Q33=="不愿意"] <- 0

#Another only counts those who 'sign' the petition

#dat$protest.japan.signed <- dat$protest.japan.all
#dat$protest.japan.signed[which(dat$Q33_1_TEXT==""&dat$protest.japan.all==1)] <- 0


# gender: 1 when Q4 is "女", 0 when Q4 is "男", NA for anything else.
dat$gender <- ifelse(dat$Q4 == "女", 1, ifelse(dat$Q4 == "男", 0, NA))


# japan: 1 for the Japan clip, 0 for the placebo clip, NA for any other arm.
dat$japan <- ifelse(dat$clip == "Japan", 1, ifelse(dat$clip == "placebo", 0, NA))

# hk: 1 for the HK clip, 0 for the placebo clip, NA for any other arm.
dat$hk <- ifelse(dat$clip == "HK", 1, ifelse(dat$clip == "placebo", 0, NA))

#Reverse code "warlike" (doesn't matter for PCA)
#dat$Q54_1 <- -dat$Q54_1+6
#dat$Q56_1 <- -dat$Q56_1+6


##
##Create PCA indices for each variable
##

# Build a standardized first-principal-component index from a set of items.
#
# data:  data frame containing the item columns.
# items: character vector of column names. The same vector is used to build
#        the princomp() formula and to find the complete cases, so the rows
#        that receive scores always match the rows the PCA was fitted on.
#
# Returns the result of scale() applied to the score vector, i.e. a one-column
# matrix with one row per row of `data`; rows with a missing value on any item
# are NA.
pca_index <- function(data, items) {
  item_formula <- reformulate(items)
  fit <- princomp(item_formula, data = data, cor = TRUE, na.action = na.omit)

  scores <- rep(NA_real_, nrow(data))
  scores[complete.cases(data[, items, drop = FALSE])] <- as.vector(fit$scores[, 1])

  scale(scores)
}

# The items are listed in the same order as in the original model formulas so
# that the variable order seen by princomp() does not change.

dat$patriot_pca <- pca_index(dat, c("Q53", "Q56_1", "Q56_2", "Q56_3"))

dat$patriot_pca_f <- pca_index(dat, c("Q53_f", "Q56_1_f", "Q56_2_f", "Q56_3_f"))


dat$antiforeign_japan_pca <- pca_index(
  dat, c("Q25", "Q26", "Q54_1", "Q54_2", "Q54_3", "Q54_4")
)

dat$antiforeign_japan_pca_f <- pca_index(
  dat, c("Q25_f", "Q26_f", "Q54_1_f", "Q54_2_f", "Q54_3_f", "Q54_4_f")
)


# The complete-case list for this index used to repeat "Q64_1" three times
# while the PCA itself was fitted on Q64_1..Q64_4; both now use the same six
# items.
# NOTE(review): antiforeign_hk_pca is assigned again further down with a
# larger item set, which replaces this version -- confirm which one is wanted.
dat$antiforeign_hk_pca <- pca_index(
  dat, c("Q69", "Q70", "Q64_1", "Q64_2", "Q64_3", "Q64_4")
)


dat$antiforeign_general_pca <- pca_index(
  dat,
  c("Q54_1", "Q54_2", "Q54_3", "Q54_4",
    "Q55_1", "Q55_2", "Q55_3", "Q55_4",
    "Q65_1", "Q65_2", "Q65_3", "Q65_4")
)


# Overwrites the antiforeign_hk_pca computed above.
dat$antiforeign_hk_pca <- pca_index(
  dat,
  c("Q69", "Q70",
    "Q64_1", "Q64_2", "Q64_3", "Q64_4",
    "Q65_1", "Q65_2", "Q65_3", "Q65_4")
)


dat$antiforeign_usa_pca <- pca_index(
  dat, c("Q69", "Q70", "Q65_1", "Q65_2", "Q65_3", "Q65_4")
)

dat$antiforeign_usa_pca_f <- pca_index(
  dat, c("Q69_f", "Q70_f", "Q65_1_f", "Q65_2_f", "Q65_3_f", "Q65_4_f")
)


dat$militant_pca <- pca_index(dat, c("Q28", "Q52", "Q26"))

dat$militant_pca_f <- pca_index(dat, c("Q28_f", "Q52_f", "Q26_f"))


# Some other variables


# urban.hukou: 1 when Q8 is "城镇", NA when Q8 is missing, 0 otherwise.
dat$urban.hukou <- 0
dat$urban.hukou[dat$Q8=="城镇"] <- 1
dat$urban.hukou[is.na(dat$Q8)] <- NA


# urban.residence: 1 when Q7 is "大城市", NA when Q7 is missing, 0 otherwise.
dat$urban.residence <- 0
dat$urban.residence[dat$Q7=="大城市"] <- 1
dat$urban.residence[is.na(dat$Q7)] <- NA


# education: ordinal code 1-6 built from the Q9 label; unlisted answers are NA.
dat$education <- NA
dat$education[dat$Q9=="小学或以下"] <- 1
dat$education[dat$Q9=="初中"] <- 2
dat$education[dat$Q9=="高中" ] <- 3
dat$education[dat$Q9== "大专"] <- 4
dat$education[dat$Q9=="大学本科" ] <- 5
dat$education[dat$Q9=="研究生或以上学历"] <- 6

# college.education: 1 for education codes 5 and 6, 0 for codes below 5.
dat$college.education <- dat$education
dat$college.education[dat$education<5] <- 0
dat$college.education[dat$education>=5] <- 1

# under.40: 1 for age codes below 5, 0 for age codes 5 and above.
dat$under.40 <- NA
dat$under.40[dat$age<5] <- 1
dat$under.40[dat$age>=5] <- 0

# income: ordinal code 1-7 built from the Q11 label. "不清楚/拒绝回答" and
# empty answers are not listed below and therefore stay NA.
#
# This has to be created BEFORE income.over.200k. Previously the flag was
# computed first, at a point where no `income` column existed yet; `dat$income`
# then partially matched the freshly created, all-NA `income.over.200k`
# column, so the flag was never filled in.
dat$income <- NA
dat$income[dat$Q11=="少于10,000"] <- 1
dat$income[dat$Q11=="10,001~30,000元"] <- 2
dat$income[dat$Q11== "30,001-60,000 元"] <- 3
dat$income[dat$Q11=="60,001-90,000 元"] <- 4
dat$income[dat$Q11=="90,001-120,000 元"] <- 5
dat$income[dat$Q11=="120,001-200,000元"] <- 6
dat$income[dat$Q11=="高于200,000元"] <- 7

# income.over.200k: 1 for the top income code (7), 0 for codes 1-6.
# [[ ]] is used instead of $ so that only the exact column name can match.
dat$income.over.200k <- NA
dat$income.over.200k[dat[["income"]] < 7] <- 0
dat$income.over.200k[dat[["income"]] > 6] <- 1

# party: 1 when Q10 is "共产党员", 0 for the three other listed answers,
# NA for anything else.
dat$party <- NA
dat$party[dat$Q10=="共产党员"] <- 1
dat$party[dat$Q10=="群众"] <- 0
dat$party[dat$Q10=="共青团员"] <- 0
dat$party[dat$Q10=="民主党派人士"] <- 0




### Save the data
# Only the cleaned data frame `dat` is written; other objects created by this
# script are not saved. The file name is relative, so the file lands in the
# current working directory.

save(dat, file = "Round_2_clean.Rdata")

